# importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
import seaborn as sns
# Load the SMS collection: tab-separated file, no header row, two columns.
sms_set = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['Label', 'SMS'],
)
# Peek at the first rows to confirm the load worked.
sms_set.head()
Label | SMS | |
---|---|---|
0 | ham | Go until jurong point, crazy.. Available only ... |
1 | ham | Ok lar... Joking wif u oni... |
2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... |
3 | ham | U dun say so early hor... U c already then say... |
4 | ham | Nah I don't think he goes to usf, he lives aro... |
# Inspect dtypes and non-null counts; the output confirms no missing values
# in either column (5572 non-null for both Label and SMS).
sms_set.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5572 entries, 0 to 5571 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Label 5572 non-null object 1 SMS 5572 non-null object dtypes: object(2) memory usage: 87.2+ KB
# Absolute count of messages per label (ham vs spam).
sms_set['Label'].value_counts()
ham 4825 spam 747 Name: Label, dtype: int64
# Label shares expressed as percentages of the whole dataset.
sms_set['Label'].value_counts(normalize=True).mul(100)
ham 86.593683 spam 13.406317 Name: Label, dtype: float64
# Shuffle the whole dataset so the split is random but reproducible.
shuffled = sms_set.sample(frac=1, random_state=1)
# 80% of the rows go to training, the remaining 20% to test.
cutoff = round(len(shuffled) * 0.8)
# reset_index(drop=True) discards the shuffled index instead of keeping it
# as an extra column.
training_set = shuffled.iloc[:cutoff].reset_index(drop=True)
test_set = shuffled.iloc[cutoff:].reset_index(drop=True)
# Label distribution in the training set — should mirror the full dataset.
training_set['Label'].value_counts(normalize=True) * 100
ham 86.54105 spam 13.45895 Name: Label, dtype: float64
# Label distribution in the test set — should mirror the full dataset.
test_set['Label'].value_counts(normalize=True) * 100
ham 86.804309 spam 13.195691 Name: Label, dtype: float64
# Normalise the text: replace every non-word character with a space, then
# lower-case. regex=True is pinned explicitly — it used to be the implicit
# default (hence the FutureWarning this cell emitted), and pandas 2.x flips
# the default to False, which would silently stop treating '\W' as a regex.
training_set['SMS'] = training_set['SMS'].str.replace(r'\W', ' ', regex=True).str.lower()
training_set.head()
C:\Users\Suporte\AppData\Local\Temp/ipykernel_1748/2879400168.py:2: FutureWarning: The default value of regex will change from True to False in a future version. training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ').str.lower()
Label | SMS | |
---|---|---|
0 | ham | yep by the pretty sculpture |
1 | ham | yes princess are you going to make me moan |
2 | ham | welp apparently he retired |
3 | ham | havent |
4 | ham | i forgot 2 ask ü all smth there s a card on ... |
# Tokenise: turn each cleaned message string into a list of words
# (split on runs of whitespace).
training_set['SMS'] = training_set['SMS'].apply(str.split)
training_set.head()
Label | SMS | |
---|---|---|
0 | ham | [yep, by, the, pretty, sculpture] |
1 | ham | [yes, princess, are, you, going, to, make, me,... |
2 | ham | [welp, apparently, he, retired] |
3 | ham | [havent] |
4 | ham | [i, forgot, 2, ask, ü, all, smth, there, s, a,... |
# Build the vocabulary: every unique word seen across all training messages.
# A set comprehension replaces the append-everything-then-dedupe loop.
vocabulary = list({word for message in training_set['SMS'] for word in message})
# number of unique words
len(vocabulary)
7783
# One zero-initialised count column per vocabulary word, one row per SMS.
n_messages = len(training_set['SMS'])
word_counts_per_sms = {word: [0] * n_messages for word in vocabulary}
# Tally each token into its (word, message) cell.
for row, message in enumerate(training_set['SMS']):
    for token in message:
        word_counts_per_sms[token][row] += 1
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()
trouser | education | usmle | violated | miss | 169 | tag | tcr | realy | 01223585334 | ... | worlds | diapers | hmph | edison | adventure | southern | 2 | name1 | kickoff | worry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 |
5 rows × 7783 columns
# Attach the per-word count columns to the labelled training rows
# (column-wise concat; both frames share the same 0..n-1 index).
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()
Label | SMS | trouser | education | usmle | violated | miss | 169 | tag | tcr | ... | worlds | diapers | hmph | edison | adventure | southern | 2 | name1 | kickoff | worry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ham | [yep, by, the, pretty, sculpture] | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | ham | [yes, princess, are, you, going, to, make, me,... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | ham | [welp, apparently, he, retired] | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | ham | [havent] | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | ham | [i, forgot, 2, ask, ü, all, smth, there, s, a,... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 |
5 rows × 7785 columns
# Split the training data by class.
spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']
ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']

# Class priors: P(Spam) and P(Ham).
p_spam = len(spam_messages) / len(training_set_clean)
p_ham = len(ham_messages) / len(training_set_clean)

# Total number of word tokens in each class.
n_words_per_spam_message = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam_message.sum()
n_words_per_ham_message = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham_message.sum()

# Vocabulary size and Laplace smoothing constant.
n_vocabulary = len(vocabulary)
alpha = 1

# Smoothed conditional probabilities P(word|Spam) and P(word|Ham),
# one entry per vocabulary word, built directly via dict comprehensions.
parameters_spam = {
    word: (spam_messages[word].sum() + alpha) / (n_spam + alpha * n_vocabulary)
    for word in vocabulary
}
parameters_ham = {
    word: (ham_messages[word].sum() + alpha) / (n_ham + alpha * n_vocabulary)
    for word in vocabulary
}
# importing regular expressions
import re
# the classifier function
def classify(message):
    """
    Classify a message as spam or ham using multinomial Naive Bayes.

    Uses the priors (p_spam, p_ham) and per-word likelihoods
    (parameters_spam, parameters_ham) previously calculated, compares
    P(Spam|message) against P(Ham|message), and returns the winner —
    or defers to a human on an exact tie.

    Arg: message (str): the raw SMS text to classify.
    Returns: str: 'spam', 'ham', or 'needs human classification'.
    """
    # Same normalisation as training: strip punctuation, lower-case, tokenise.
    # Raw string avoids the invalid-escape SyntaxWarning '\W' raises on 3.12+.
    words = re.sub(r'\W', ' ', message).lower().split()
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    # NOTE(review): multiplying many small probabilities can underflow to 0.0
    # on very long messages; summing log-probabilities would be safer —
    # confirm before changing, as exact tie behaviour would differ.
    for word in words:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    if p_spam_given_message > p_ham_given_message:
        return 'spam'
    return 'needs human classification'
# Spot-check the classifier on two messages from the held-out test set.
for idx in (0, 30):
    print(test_set['SMS'][idx])
    print(classify(test_set['SMS'][idx]))
Later i guess. I needa do mcat study too. ham Get your garden ready for summer with a FREE selection of summer bulbs and seeds worth £33:50 only with The Scotsman this Saturday. To stop go2 notxt.co.uk spam
# Classify every test message and store the verdict in a 'Predicted' column.
test_set['Predicted'] = test_set['SMS'].map(classify)
test_set.head()
Label | SMS | Predicted | |
---|---|---|---|
0 | ham | Later i guess. I needa do mcat study too. | ham |
1 | ham | But i haf enuff space got like 4 mb... | ham |
2 | spam | Had your mobile 10 mths? Update to latest Oran... | spam |
3 | ham | All sounds good. Fingers . Makes it difficult ... | ham |
4 | ham | All done, all handed in. Don't know if mega sh... | ham |
# Accuracy of the spam filter: fraction of test messages whose predicted
# label matches the true one.
total = len(test_set)  # number of sms in the test set
# Vectorised element-wise comparison replaces the row-by-row iterrows() loop;
# same count, one C-level pass instead of a Python loop.
correct = int((test_set['Predicted'] == test_set['Label']).sum())
accuracy = correct / total * 100
print(correct)
print(total)
print(round(accuracy, 2))
1100 1114 98.74
# Classify the wrong results by error type.
pred = test_set['Predicted']
truth = test_set['Label']
false_spam = test_set[(pred == 'spam') & (truth == 'ham')]  # ham flagged as spam
false_ham = test_set[(pred == 'ham') & (truth == 'spam')]   # spam that slipped through
needs_human_classification = test_set[pred == 'needs human classification']
# inspect the ham messages wrongly flagged as spam
print(false_spam)
Label SMS Predicted 152 ham Unlimited texts. Limited minutes. spam 159 ham 26th OF JULY spam 284 ham Nokia phone is lovly.. spam 302 ham No calls..messages..missed calls spam 319 ham We have sent JD for Customer Service cum Accou... spam
# Spam messages the filter let through as ham (false negatives).
print(false_ham)
Label SMS Predicted 114 spam Not heard from U4 a while. Call me now am here... ham 135 spam More people are dogging in your area now. Call... ham 504 spam Oh my god! I've found your number again! I'm s... ham 546 spam Hi babe its Chloe, how r u? I was smashed on s... ham 741 spam 0A$NETWORKS allow companies to bill for SMS, s... ham 876 spam RCT' THNQ Adrian for U text. Rgds Vatian ham 885 spam 2/2 146tf150p ham 953 spam Hello. We need some posh birds and chaps to us... ham
# Messages the filter could not decide on (exact probability tie).
print(needs_human_classification)
Label SMS \ 293 ham A Boy loved a gal. He propsd bt she didnt mind... Predicted 293 needs human classification
# Work on a copy so the original frame stays untouched.
skl_set = sms_set.copy()
# Encode labels numerically: spam -> 1, ham -> 0.
skl_set['Label'] = (skl_set['Label'] == 'spam').astype(int)
skl_set.head()
Label | SMS | |
---|---|---|
0 | 0 | Go until jurong point, crazy.. Available only ... |
1 | 0 | Ok lar... Joking wif u oni... |
2 | 1 | Free entry in 2 a wkly comp to win FA Cup fina... |
3 | 0 | U dun say so early hor... U c already then say... |
4 | 0 | Nah I don't think he goes to usf, he lives aro... |
# importing the methods, functions and classes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Hold out 20% of the data for testing; fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    skl_set['SMS'], skl_set['Label'], test_size=0.2, random_state=0
)
# Learn the token vocabulary from the training texts only, then vectorise.
# NOTE(review): token_pattern=r'\W' makes tokens out of runs of NON-word
# characters (punctuation), which is why only 45 features come out — this
# looks like a typo for r'\w+'. Kept as-is to preserve the recorded results;
# confirm intent before changing. (Raw string added to silence the
# invalid-escape SyntaxWarning; the pattern itself is byte-identical.)
vectorizer = CountVectorizer(token_pattern=r'\W').fit(X_train)
X_train_vectorized = vectorizer.transform(X_train)
X_train_vectorized.toarray().shape
(4457, 45)
# Instantiate multinomial Naive Bayes with Laplace smoothing (alpha=1)
# and fit it on the vectorised training data.
model = MultinomialNB(alpha=1)
model.fit(X_train_vectorized, Y_train)
MultinomialNB(alpha=1)
# Predict on the held-out set and report the percentage of correct labels.
predictions = model.predict(vectorizer.transform(X_test))
hit_rate = 100 * sum(predictions == Y_test) / len(predictions)
print("Accuracy:", hit_rate, '%')
Accuracy: 94.17040358744394 %
# Second experiment: re-split without an explicit test_size (sklearn's
# default holds out 25%), richer features, lighter smoothing.
X_train, X_test, Y_train, Y_test = train_test_split(
    skl_set['SMS'], skl_set['Label'], random_state=0
)
# Unigram + bigram features this time, with the default token pattern.
vectorizer = CountVectorizer(ngram_range=(1, 2)).fit(X_train)
X_train_vectorized = vectorizer.transform(X_train)
X_train_vectorized.toarray().shape
# Instantiate the model with lighter smoothing and fit the training data.
model = MultinomialNB(alpha=0.1)
model.fit(X_train_vectorized, Y_train)
# Predict on the held-out set and measure the accuracy.
predictions = model.predict(vectorizer.transform(X_test))
print("Accuracy:", 100 * sum(predictions == Y_test) / len(predictions), '%')
Accuracy: 98.77961234745155 %